In [1]:
import pandas as pd

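# Source: https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset
# NOTE(review): the URL above names a sleep-health dataset, while the CSV loaded
# in this cell is a list of languages by number of speakers -- confirm the real source.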

# Load the table of languages and their speaker counts.
file = "data/List of languages by total number of speakers.csv"
df = pd.read_csv(file)

# Convert the L1 speaker counts from text to plain numbers expressed in millions.
# Other columns of this same file contain cells such as "1.080 billion[5]", so the
# parsing tolerates footnote markers and a "billion" unit instead of letting
# errors="coerce" silently turn such cells into NaN.
l1_col = "First-language(L1) speakers"
l1_text = df[l1_col].str.replace(r"\[[^\]]*\]", "", regex=True).str.strip()
in_billions = l1_text.str.contains("billion", case=False, na=False)
l1_number = pd.to_numeric(
    l1_text.str.replace(r"\s*(?:million|billion)", "", case=False, regex=True),
    errors="coerce",  # anything that is still not a number stays NaN
)
# Rescale the billion-denominated cells so the whole column is in millions.
df[l1_col] = l1_number.where(~in_billions, l1_number * 1000)

df.head()
Out[1]:
Unnamed: 0 Language Family Branch First-language(L1) speakers Second-language(L2) speakers Total speakers(L1+L2)
0 0 English(excl. creole languages) Indo-European Germanic 372.9 1.080 billion[5] 1.452 billion
1 1 Mandarin Chinese(incl. Standard Chinese, but e... Sino-Tibetan Sinitic 929.0 198.7 million[6] 1.118 billion
2 2 Hindi(excl. Urdu) Indo-European Indo-Aryan 343.9 258.3 million[7] 602.2 million
3 3 Spanish Indo-European Romance 474.7 73.6 million[8] 548.3 million
4 4 French Indo-European Romance 79.9 194.2 million[9] 274.1 million
In [2]:
import plotly.express as px

# Build a treemap of languages, sized and coloured by native (L1) speakers in millions.
l1_col = "First-language(L1) speakers"

# Rows without an L1 figure have no area to draw, so leave them out instead of
# handing NaN sizes/colours to plotly.
treemap_df = df.dropna(subset=[l1_col])

fig = px.treemap(
    treemap_df,
    # `path` is ordered from root to leaves: a family (e.g. Indo-European) contains
    # branches (e.g. Germanic, Romance), which contain the individual languages.
    path=["Family", "Branch", "Language"],
    values=l1_col,
    color=l1_col,
    color_continuous_scale=px.colors.sequential.Mint,
    # Title fixed: "món" (world) rather than "mon" (my).
    title="Idiomes més parlats al món (x milions de parlants com a llengua materna)",
)
fig.show()